In [1]:
from __future__ import print_function, unicode_literals, division
from cytoolz.dicttoolz import valmap
from collections import Counter
import pandas as pd
import json
import gzip
import numpy as np
import dbpedia_config
In [2]:
target_folder = dbpedia_config.TARGET_FOLDER
First, we load a list of English stopwords, and extend it with some stopwords that we found in the dataset while exploring word frequencies.
Note that the stopword list is stored in the file stopwords_en.txt in our target folder (in the case of the English edition).
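The file is just a whitespace-separated word list. A minimal sketch of how such a file could be bootstrapped (assuming the NLTK stopword corpus is available; the actual list in the target folder may have been curated by hand) is:
# Hypothetical bootstrap for stopwords_en.txt; the real file may be hand-curated.
from nltk.corpus import stopwords as nltk_stopwords
with open('{0}/stopwords_en.txt'.format(target_folder), 'w') as out:
    out.write('\n'.join(nltk_stopwords.words('english')))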
In [3]:
with open('{0}/stopwords_{1}.txt'.format(target_folder, dbpedia_config.MAIN_LANGUAGE), 'r') as f:
    stopwords = f.read().split()

stopwords.extend('Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.lower().split())
stopwords.extend('January February March April May June July August September October November December'.lower().split())
stopwords.extend('one two three four five six seven eight nine ten'.lower().split())
len(stopwords)
Out[3]:
We also load our person data.
In [4]:
person_data = pd.read_csv('{0}/person_data_en.csv.gz'.format(target_folder), encoding='utf-8', index_col='uri')
Out[4]:
In [11]:
N = person_data.gender.value_counts()
N
Out[11]:
We also load our vocabulary. We will consider only words that appear for both genders, so that comparing their association with each gender makes sense.
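For context, we assume vocabulary.json.gz is a JSON object keyed by gender, where each value maps a word (or bigram) to its count. A quick sanity check along those lines:
# Sanity check on the assumed layout of vocabulary.json.gz: a JSON object
# keyed by gender, mapping each word/bigram to its count.
with gzip.open('{0}/vocabulary.json.gz'.format(target_folder), 'rb') as f:
    sample = json.load(f)
print(sorted(sample.keys()))                      # expected: ['female', 'male']
print(type(next(iter(sample['male'].values()))))  # counts should be ints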
In [8]:
with gzip.open('{0}/vocabulary.json.gz'.format(target_folder), 'rb') as f:
    vocabulary = valmap(Counter, json.load(f))

common_words = list(set(vocabulary['male'].keys()) & set(vocabulary['female'].keys()))
len(common_words)
Out[8]:
In [20]:
def word_iter():
    for w in common_words:
        if w in stopwords:
            continue
        yield {'male': vocabulary['male'][w], 'female': vocabulary['female'][w], 'word': w}

words = pd.DataFrame.from_records(word_iter(), index='word')
Now we estimate PMI. Recall that PMI is:
$$\mbox{PMI}(c, w) = \log \frac{p(c, w)}{p(c)\, p(w)}$$
where $c$ is a class (or gender) and $w$ is a word (or bigram in our case). To normalize PMI we can divide by $-\log p(c, w)$.
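Written out, the normalized score we compute below (call it nPMI here) is
$$\mbox{nPMI}(c, w) = \frac{\mbox{PMI}(c, w)}{-\log p(c, w)} = \frac{\log \frac{p(c, w)}{p(c)\, p(w)}}{-\log p(c, w)},$$
which bounds the score between $-1$ and $1$.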
In [22]:
p_c = N / N.sum()
p_c
Out[22]:
In [23]:
words['p_w'] = (words['male'] + words['female']) / N.sum()
words['p_w'].head(5)
Out[23]:
In [30]:
words['p_male_w'] = words['male'] / N.sum()
words['p_female_w'] = words['female'] / N.sum()
In [31]:
words['pmi_male'] = np.log(words['p_male_w'] / (words['p_w'] * p_c['male'])) / -np.log(words['p_male_w'])
words['pmi_female'] = np.log(words['p_female_w'] / (words['p_w'] * p_c['female'])) / -np.log(words['p_female_w'])
In [32]:
words.head()
Out[32]:
Now we are ready to explore PMI. Recall that PMI overweights words with extremely low frequencies, so we need to set a minimum-frequency threshold. For instance, in our previous paper we used 1% of biographies as the threshold. This time we have more biographies, and with 1% we do not get 200 words for women.
Hence, this time we lower the bar to 0.1%.
In [84]:
min_p = 0.001
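A quick check (the exact number depends on the data) that this threshold leaves comfortably more than 200 candidate words:
# Words above the frequency threshold; this should exceed 200 so that both
# top-200 lists below are well defined.
print((words['p_w'] > min_p).sum())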
In [91]:
top_female = words[words.p_w > min_p].sort_values(by=['pmi_female'], ascending=False)
top_female.head(10)
Out[91]:
In [92]:
top_male = words[words.p_w > min_p].sort_values(by=['pmi_male'], ascending=False)
top_male.head(10)
Out[92]:
We will save both lists of top-200 words and then manually annotate them into a set of categories.
We will add that categorization in a column named "cat" and process it in the following notebook.
In [93]:
top_male.head(200).to_csv('{0}/top-200-pmi-male.csv'.format(target_folder), encoding='utf-8')
top_female.head(200).to_csv('{0}/top-200-pmi-female.csv'.format(target_folder), encoding='utf-8')
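As a sketch of the hand-off to the next notebook (the assumption here is that the annotated files keep the same names and gain a manually filled "cat" column):
# Hypothetical round trip once the manual annotation is done.
annotated = pd.read_csv('{0}/top-200-pmi-female.csv'.format(target_folder),
                        encoding='utf-8', index_col='word')
print(annotated['cat'].value_counts())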